This is an R Markdown document containing code and plots for Abumazen et al. “Nasopharyngeal metatranscriptomics reveals host-pathogen signatures of pediatric sinusitis”.
library(reshape2)
library(pheatmap)
library(pROC)
library(RColorBrewer)
library(dplyr)
library(plyr)
library(vioplot)
library(Metrics)
library(DESeq2)
library(tximport)
library(EnhancedVolcano)
library(patchwork)
library(knitr)
library(vegan)
library(ggplot2)
Reading in the kraken2 data
tb = read.delim("d1-d5_fastp_dehost_krakenmerged_reads_normalizedPlus14PosControls.txt",sep='\t',col.names=(c("X","Y","Z")))
matr = dcast(tb,Y ~ X,value.var = "Z")
rownames(matr) = matr[,1]
matr = matr[,-1]
for (i in 1:nrow(matr))
{ matr[i,which(is.na(matr[i,]))] = 0
}
matr = t(data.matrix(matr))
splitrownames = vector(length = nrow(matr))
for (i in 1:length(splitrownames))
{
rownames(matr)[i] = strsplit(rownames(matr),"_")[[i]][1]
}
Reading in the metadata
metadata = read.csv("metadata.11.26.24.csv",header=T)
metadata = metadata[which(metadata$batch != 1),] #removing samples from batch 1
shared = intersect(rownames(matr),metadata[,1])
matr = matr[match(shared,rownames(matr)),]
metadata = metadata[match(shared,metadata[,1]),]
Computing NMDS ordinations
# Step 1: Compute Bray-Curtis Dissimilarity Matrix
beta_dissimilarity <- vegdist(matr, method = "bray")
# Step 2: Perform PCoA
pcoa_result <- cmdscale(beta_dissimilarity, k = 2, eig = TRUE) # k = 2 for 2D
# Step 3: Prepare Data for ggplot
pcoa_data <- data.frame(
PCoA1 = pcoa_result$points[, 1],
PCoA2 = pcoa_result$points[, 2],
bv_both_none = metadata$bv_both_none
)
# Step 4: Plot PCoA Results
ggplot(pcoa_data, aes(x = PCoA1, y = PCoA2, color = factor(bv_both_none))) +
geom_point(size = 3) +
labs(
title = "PCoA Ordination (Bray-Curtis)",
x = "PCoA Dimension 1",
y = "PCoA Dimension 2",
color = "BV Category"
) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5),
legend.title = element_text(size = 12)
)
Comparing alpha-diversity across groups
# Compute alpha diversity metrics
alpha_diversity <- data.frame(
Shannon = diversity(matr, index = "shannon"), # Shannon index
Simpson = diversity(matr, index = "simpson"), # Simpson index
Richness = rowSums(matr > 0), # Richness (number of non-zero species)
bv_both_none = metadata$bv_both_none # Add grouping information
)
# Ensure bv_both_none is treated as a factor
alpha_diversity$bv_both_none <- as.factor(alpha_diversity$bv_both_none)
# Plot alpha diversity (Shannon index) by group
ggplot(alpha_diversity, aes(x = bv_both_none, y = Shannon, fill = bv_both_none)) +
geom_boxplot() +
labs(
title = "Alpha Diversity (Shannon Index)",
x = "BV Category",
y = "Shannon Diversity"
) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5),
legend.position = "none"
)
Statistical comparisons
# Perform Kruskal-Wallis test
kruskal_result <- kruskal.test(Shannon ~ bv_both_none, data = alpha_diversity)
print(kruskal_result)
##
## Kruskal-Wallis rank sum test
##
## data: Shannon by bv_both_none
## Kruskal-Wallis chi-squared = 7.0048, df = 4, p-value = 0.1356
# Post-hoc pairwise comparisons (Dunn's test)
library(FSA) # For Dunn's test
dunn_result <- dunnTest(Shannon ~ bv_both_none, data = alpha_diversity, method = "bh")
print(dunn_result)
## Comparison Z P.unadj P.adj
## 1 0 - 1 0.9040555 0.36596600 0.5228086
## 2 0 - 2 2.0119384 0.04422643 0.2211321
## 3 1 - 2 1.3027301 0.19266692 0.3853338
## 4 0 - 3 2.0429169 0.04106066 0.4106066
## 5 1 - 3 1.2362064 0.21638186 0.3606364
## 6 2 - 3 -0.4341663 0.66416772 0.6641677
## 7 0 - 4 1.9575507 0.05028276 0.1676092
## 8 1 - 4 1.4140971 0.15733339 0.3933335
## 9 2 - 4 0.5438673 0.58653282 0.6517031
## 10 3 - 4 0.8494120 0.39565209 0.4945651
detach("package:FSA", unload = TRUE)
Now, comparing between group 0 (no pathogen detected) and groups 1-3 (pathogens detected)
# Create a binary grouping variable
alpha_diversity$group_binary <- ifelse(alpha_diversity$bv_both_none == 0, "Group 0", "Groups 1-3")
wilcox_result <- wilcox.test(Shannon ~ group_binary, data = alpha_diversity)
print(wilcox_result)
##
## Wilcoxon rank sum test with continuity correction
##
## data: Shannon by group_binary
## W = 2570, p-value = 0.04197
## alternative hypothesis: true location shift is not equal to 0
# Boxplot with binary groups
ggplot(alpha_diversity, aes(x = group_binary, y = Shannon, fill = group_binary)) +
geom_boxplot() +
labs(
title = "Comparison of Group 0 vs. Groups 1-3",
x = "Group",
y = "Shannon Diversity"
) +
theme_minimal() +
theme(
legend.position = "none",
plot.title = element_text(hjust = 0.5)
)
group0 = which(metadata$bv_both_none == 0)
group1 = which(metadata$bv_both_none == 1)
group2 = which(metadata$bv_both_none == 2)
group3 = which(metadata$bv_both_none == 3)
# group1 (bacterial) vs group0 (none) comparison
pvals = vector(length = ncol(matr))
fc = vector(length = ncol(matr))
for (i in 1:ncol(matr))
{
pvals[i] = wilcox.test(matr[group1,i],matr[group0,i])$p.value
fc[i] = log((mean(matr[group1,i]) + 1)/(mean(matr[group0,i]) + 1),base=2)
}
cols = vector(length = ncol(matr))
cols[] = "gray"
cols[which(colnames(matr) == "Haemophilus influenzae")] = "green"
cols[which(colnames(matr) == "Moraxella catarrhalis")] = "red"
cols[which(colnames(matr) == "Streptococcus pneumoniae")] = "purple"
pvals[which(is.nan(pvals))] = 1
plot(fc,-log(pvals,10),col=alpha(cols,0.5),pch=19)
colnames(matr)[intersect(which(abs(fc) >= 1),which(pvals <= 0.05))]
## [1] "[Haemophilus] ducreyi" "Haemophilus haemolyticus"
## [3] "Haemophilus influenzae" "Moraxella catarrhalis"
## [5] "Moraxella nonliquefaciens" "Streptococcus pneumoniae"
# group2 (viral) vs group0 (none) comparison
pvals = vector(length = ncol(matr))
fc = vector(length = ncol(matr))
for (i in 1:ncol(matr))
{
pvals[i] = wilcox.test(matr[group2,i],matr[group0,i])$p.value
fc[i] = log((mean(matr[group2,i]) + 1)/(mean(matr[group0,i]) + 1),base=2)
}
cols = vector(length = ncol(matr))
cols[] = "gray"
cols[which(colnames(matr) == "Influenza A virus")] = "blue"
cols[which(colnames(matr) == "Rhinovirus A")] = "orange"
pvals[which(is.nan(pvals))] = 1
plot(fc,-log(pvals,10),col=alpha(cols,0.5),pch=19)
colnames(matr)[intersect(which(abs(fc) >= 1),which(pvals <= 0.05))]
## [1] "Influenza A virus" "Rhinovirus A"
# group3 (viral) vs group0 (none) comparison
pvals = vector(length = ncol(matr))
fc = vector(length = ncol(matr))
for (i in 1:ncol(matr))
{
pvals[i] = wilcox.test(matr[group3,i],matr[group0,i])$p.value
fc[i] = log((mean(matr[group3,i]) + 1)/(mean(matr[group0,i]) + 1),base=2)
}
cols = vector(length = ncol(matr))
cols[] = "gray"
cols[which(colnames(matr) == "Haemophilus influenzae")] = "green"
cols[which(colnames(matr) == "Moraxella catarrhalis")] = "red"
cols[which(colnames(matr) == "Streptococcus pneumoniae")] = "purple"
cols[which(colnames(matr) == "Influenza A virus")] = "blue"
cols[which(colnames(matr) == "Rhinovirus A")] = "orange"
pvals[which(is.nan(pvals))] = 1
plot(fc,-log(pvals,10),col=alpha(cols,0.5),pch=19)
colnames(matr)[intersect(which(abs(fc) >= 1),which(pvals <= 0.05))]
## [1] "Aggregatibacter actinomycetemcomitans"
## [2] "Giardia intestinalis"
## [3] "Haemophilus haemolyticus"
## [4] "Haemophilus influenzae"
## [5] "Haemophilus parahaemolyticus"
## [6] "Haemophilus parainfluenzae"
## [7] "Moraxella catarrhalis"
## [8] "Rhinovirus A"
## [9] "Rhodopseudomonas palustris"
## [10] "Streptococcus pneumoniae"
## [11] "Treponema denticola"
annotation_row = metadata[,c(22,24,26)]
annotation_row = (annotation_row -2) * -1 #just to make it going 0 to 1
rownames(annotation_row) = metadata[,1]
colnames(annotation_row) = c("SPN", "HFLU", "MCAT")
cols = c("white",colorRampPalette((brewer.pal(n = 7, name =
"YlGnBu")))(99))
ann_colors = list(
SPN = c("white", "light gray"),
HFLU = c("white", "light gray"),
MCAT = c("white", "light gray")
)
pheatmap(log(matr[,c(grep("Moraxella cat", colnames(matr)),grep("Streptococcus pne", colnames(matr)),grep("Haemophilus inf", colnames(matr)))] + 1, 10),annotation_row=annotation_row[,c(2,1,3)],cluster_cols = F, show_rownames = FALSE,col=cols,annotation_colors = ann_colors)
metadata[which(is.na(metadata[,"mcat"])),"mcat"] = "TrueNEG"
group = metadata[,"mcat"]
group <- mapvalues(group,
from=c("TrueNEG","2","1"),
to=c("TrueNEG","-","+"))
group = as.factor(group)
group <- factor(group, levels = c("TrueNEG", "-", "+")) # Set the desired order
abund = log(matr[,"Moraxella catarrhalis"] + 1,10)
p = ggplot(data.frame(mcat = abund), aes(y = abund, x = group,fill=as.factor(group))) +
geom_boxplot(outlier.shape=NA) +
theme(legend.position = "none") +
ggtitle("mcat") + theme(plot.title = element_text(hjust = 0.5)) +
labs(y = "RNA-seq abundance\nlog10(rpm)", x = "Culture test") +
geom_point(pch = 21, position = position_jitterdodge())
p
metadata[which(is.na(metadata[,"hflu"])),"hflu"] = "TrueNEG"
group = metadata[,"hflu"]
group <- mapvalues(group,
from=c("TrueNEG","2","1"),
to=c("TrueNEG","-","+"))
group = as.factor(group)
group <- factor(group, levels = c("TrueNEG", "-", "+")) # Set the desired order
abund = log(matr[,"Haemophilus influenzae"] + 1,10)
p = ggplot(data.frame(hflu = abund), aes(y = abund, x = group,fill=as.factor(group))) +
geom_boxplot(outlier.shape=NA) +
theme(legend.position = "none") +
ggtitle("hflu") + theme(plot.title = element_text(hjust = 0.5)) +
labs(y = "RNA-seq abundance\nlog10(rpm)", x = "Culture test") +
geom_point(pch = 21, position = position_jitterdodge())
p
metadata[which(is.na(metadata[,"spn"])),"spn"] = "TrueNEG"
group = metadata[,"spn"]
group <- mapvalues(group,
from=c("TrueNEG","2","1"),
to=c("TrueNEG","-","+"))
group = as.factor(group)
group <- factor(group, levels = c("TrueNEG", "-", "+")) # Set the desired order
abund = log(matr[,"Streptococcus pneumoniae"] + 1,10)
p = ggplot(data.frame(spn = abund), aes(y = abund, x = group,fill=as.factor(group))) +
geom_boxplot(outlier.shape=NA) +
theme(legend.position = "none") +
ggtitle("spn") + theme(plot.title = element_text(hjust = 0.5)) +
labs(y = "RNA-seq abundance\nlog10(rpm)", x = "Culture test") +
geom_point(pch = 21, position = position_jitterdodge())
p
metadata.Valid = metadata[which(metadata[,"batch"] != 6),]
matr.Valid = matr[which(metadata[,"batch"] != 6),]
bacterial_detection_threshold = 3 # gives a good balance of sensitivity and specificity
# number of HFLU detected
hflu = which(matr.Valid[,"Haemophilus influenzae"] >= bacterial_detection_threshold)
length(hflu)
## [1] 81
# number of SPN detected
spn = which(matr.Valid[,"Streptococcus pneumoniae"] >= bacterial_detection_threshold)
length(spn)
## [1] 73
# number of MCAT detected
mcat = which(matr.Valid[,"Moraxella catarrhalis"] >= bacterial_detection_threshold)
length(mcat)
## [1] 137
#any of the above three
length(unique(c(hflu,spn,mcat)))
## [1] 177
#two or more
length(unique(c(intersect(hflu,mcat),intersect(hflu,spn),intersect(spn,mcat))))
## [1] 89
#all three
length(intersect(intersect(hflu,mcat),spn))
## [1] 25
values = unique(rev(sort(matr.Valid[,"Moraxella catarrhalis"])))
TPR = vector(length = length(values))
FPR = vector(length = length(values))
for (i in 1:length(values))
{ k = values[i]
matches = which(matr[,"Moraxella catarrhalis"] >= k)
TPR[i] = length(intersect(matches,which(metadata.Valid[,"mcat"] == 1))) / length(which(metadata.Valid[,"mcat"] == 1))
FPR[i] = length(intersect(matches,which(metadata.Valid[,"mcat"] == 2))) / length(which(metadata.Valid[,"mcat"] == 2))
}
prediction = matr.Valid[,"Moraxella catarrhalis"]
response = as.numeric(metadata.Valid[,"mcat"])
response = (response -2) * -1
auc(response,prediction)
## [1] 0.823844
auc_mcat = auc(response,prediction)
plot(FPR,TPR,type="l",main=paste("mcat","AUC = ",round(auc_mcat,2)))
matches = which(matr.Valid[,"Moraxella catarrhalis"] >= bacterial_detection_threshold)
tpr0 = length(intersect(matches,which(metadata.Valid[,"mcat"] == 1))) / length(which(metadata.Valid[,"mcat"] == 1))
fpr0 = length(intersect(matches,which(metadata.Valid[,"mcat"] == 2))) / length(which(metadata.Valid[,"mcat"] == 2))
points(x = fpr0, y = tpr0,cex=2)
# prints out the sensitivity and the specificity given the threshold defined above
print(c("mcat",tpr0 * 100,100-(fpr0 * 100)))
## [1] "mcat" "84.7457627118644" "64.0776699029126"
sens_mcat = tpr0 * 100
spec_mcat = 100-(fpr0 * 100)
metadata.Valid = metadata[which(metadata[,"batch"] != 6),]
matr.Valid = matr[which(metadata[,"batch"] != 6),]
values = unique(rev(sort(matr.Valid[,"Streptococcus pneumoniae"])))
TPR = vector(length = length(values))
FPR = vector(length = length(values))
for (i in 1:length(values))
{ k = values[i]
matches = which(matr.Valid[,"Streptococcus pneumoniae"] >= k)
TPR[i] = length(intersect(matches,which(metadata.Valid[,"spn"] == 1))) / length(which(metadata.Valid[,"spn"] == 1))
FPR[i] = length(intersect(matches,which(metadata.Valid[,"spn"] == 2))) / length(which(metadata.Valid[,"spn"] == 2))
}
prediction = matr.Valid[,"Streptococcus pneumoniae"]
response = as.numeric(metadata.Valid[,"spn"])
response = (response -2) * -1
auc(response,prediction)
## [1] 0.8915326
auc_spn = auc(response,prediction)
plot(FPR,TPR,type="l",main=paste("spn","AUC = ",round(auc_spn,2)))
matches = which(matr.Valid[,"Streptococcus pneumoniae"] >= bacterial_detection_threshold)
tpr0 = length(intersect(matches,which(metadata.Valid[,"spn"] == 1))) / length(which(metadata.Valid[,"spn"] == 1))
fpr0 = length(intersect(matches,which(metadata.Valid[,"spn"] == 2))) / length(which(metadata.Valid[,"spn"] == 2))
points(x = fpr0, y = tpr0,cex=2)
# prints out the sensitivity and the specificity given the threshold defined above
print(c("spn",tpr0 * 100,100-(fpr0 * 100)))
## [1] "spn" "81.4285714285714" "89.4039735099338"
sens_spn = tpr0 * 100
spec_spn = 100-(fpr0 * 100)
metadata.Valid = metadata[which(metadata[,"batch"] != 6),]
matr.Valid = matr[which(metadata[,"batch"] != 6),]
values = unique(rev(sort(matr.Valid[,"Haemophilus influenzae"])))
TPR = vector(length = length(values))
FPR = vector(length = length(values))
for (i in 1:length(values))
{ k = values[i]
matches = which(matr.Valid[,"Haemophilus influenzae"] >= k)
TPR[i] = length(intersect(matches,which(metadata.Valid[,"hflu"] == 1))) / length(which(metadata.Valid[,"hflu"] == 1))
FPR[i] = length(intersect(matches,which(metadata.Valid[,"hflu"] == 2))) / length(which(metadata.Valid[,"hflu"] == 2))
}
prediction = matr.Valid[,"Haemophilus influenzae"]
response = as.numeric(metadata.Valid[,"hflu"])
response = (response -2) * -1
auc(response,prediction)
## [1] 0.9549669
auc_hflu = auc(response,prediction)
plot(FPR,TPR,type="l",main=paste("hflu","AUC = ",round(auc_hflu,2)))
matches = which(matr.Valid[,"Haemophilus influenzae"] > bacterial_detection_threshold)
tpr0 = length(intersect(matches,which(metadata.Valid[,"hflu"] == 1))) / length(which(metadata.Valid[,"hflu"] == 1))
fpr0 = length(intersect(matches,which(metadata.Valid[,"hflu"] == 2))) / length(which(metadata.Valid[,"hflu"] == 2))
points(x = fpr0, y = tpr0,cex=2)
# prints out the sensitivity and the specificity given the threshold defined above
print(c("hflu",tpr0 * 100,100-(fpr0 * 100)))
## [1] "hflu" "94.2857142857143" "90.0662251655629"
sens_hflu = tpr0 * 100
spec_hflu = 100-(fpr0 * 100)
accuracies = cbind(sens = c(mcat = sens_mcat,spn = sens_spn,hflu = sens_hflu),spec = c(mcat = spec_mcat,spec_spn,spec_hflu),auc = c(auc_mcat, auc_spn, auc_hflu))
kable(accuracies)
| sens | spec | auc | |
|---|---|---|---|
| mcat | 84.74576 | 64.07767 | 0.8238440 |
| spn | 81.42857 | 89.40397 | 0.8915326 |
| hflu | 94.28571 | 90.06623 | 0.9549669 |
To explore, we will first subset the data to all viruses (species with “virus” in their name) with greater than log10(rpm) +1 > 0.3 (roughly 1 read per million), as well as any species with log10+1 abundance greater than 3 since this threshold includes our three bacterial sinusitis pathogens (SPN, HFU, MCAT).
matr.subset = matr.Valid[,unique(c(which(apply(log(matr.Valid + 1,10),2,max) > 3),intersect(grep("virus",colnames(matr.Valid)),which(apply(log(matr.Valid + 1,10),2,max) > 0.3))))]
pathTable = rev(sort(apply(log(matr.subset + 1,10),2,max)))
cols = vector(length = length(pathTable))
cols[] = "red"
cols[which(names(pathTable) %in% c(three_bacteria,panel_viruses))] = "black"
par(mfrow=c(2,1))
barplot(pathTable,col=cols,las=3)
pheatmap(t(log(matr.Valid[,names(pathTable)] + 1,10)),cluster_rows=F,fontsize_row=5)
The above list of species was then investigated further using BLAST and literature searches to identify a smaller subset of pathogens of interest.
pathogensOfInterest = c("Influenza A virus", "Influenza C virus", "Haemophilus influenzae", "Human coronavirus NL63", "Influenza B virus", "Betacoronavirus 1", "Respiratory syncytial virus", "Human coronavirus HKU1", "Human respirovirus 3", "Human orthopneumovirus", "Moraxella nonliquefaciens", "Fusobacterium nucleatum", "Human mastadenovirus C", "Human metapneumovirus", "Moraxella catarrhalis", "Prevotella intermedia", "Metamycoplasma salivarium", "Streptococcus pneumoniae", "Pasteurella multocida", "Human orthorubulavirus 4", "Tannerella forsythia", "Corynebacterium pseudodiphtheriticum", "Moraxella osloensis", "Enterovirus B", "Mycoplasma pneumoniae", "Enterovirus A", "Human orthorubulavirus 2", "Chlamydia pneumoniae", "Treponema medium", "Human coronavirus 229E", "Rhinovirus C", "Enterovirus D", "Rhinovirus A", "Human respirovirus 1", "Murine leukemia virus", "Rhinovirus B", "Human mastadenovirus B", "Human gammaherpesvirus 4", "Human betaherpesvirus 5", "Mamastrovirus 9", "Cardiovirus B", "Betapolyomavirus quartihominis", "Parechovirus A")
matr.subset = matr[,pathogensOfInterest]
pathTable = rev(sort(apply(log(matr.subset + 1,10),2,max)))
cols = vector(length = length(pathTable))
cols[] = "red"
cols[which(names(pathTable) %in% c(three_bacteria,panel_viruses))] = "black"
par(mfrow=c(2,1))
barplot(pathTable,col=cols,las=3,cex.names=0.5)
#plot the pathogen-positive samples (basedon culture/PCR)
breaksList = seq(0, 5, by = 1)
cols = c("white",colorRampPalette(brewer.pal(n = 4, name="YlGnBu"))(4))
temp.matr = t(log(matr[which(apply(metadata[,c(27,64)],1,sum) != 0),names(pathTable)] + 1,10))
temp.matr[temp.matr > 5] = 5 # adds ceiling to data to make color range consistent with next plot
pheatmap(temp.matr,cluster_rows=F,fontsize_row=5,fontsize_col=3,zlim=c(0,5),color =rev(cols), breaks=breaksList)
#plot the pathogen-negative samples (basedon culture/PCR)
pheatmap(t(log(matr[which(apply(metadata[,c(27,64)],1,sum) == 0),names(pathTable)] + 1,10)),cluster_rows=F,fontsize_row=5,fontsize_col=3,zlim=c(0,5),color = rev(cols),breaks=breaksList)
genesymbols = read.delim("host-expression/gencode.v39.metadata.HGNC")
metadata = metadata[which(metadata$batch != 1),] #removing samples from batch 1
metadata[,"batch"] = as.factor(metadata[,"batch"])
metadata[,"bv_both_none"] = as.factor(metadata[,"bv_both_none"])
metadata[which(as.numeric(metadata[,"cum_pathogn"]) > 1),"cum_pathogn"] = 1
metadata[,"cum_pathogn"] = as.factor(metadata[,"cum_pathogn"])
metadata[,"sex"] = as.factor(metadata[,"sex"])
metadata$age_scaled = scale(metadata$Age.in.years)
onlyvirusonlybacteria_samples = metadata[which((metadata$bv_both_none == 1) | (metadata$bv_both_none == 2)),]
onlyvirusonlybacteria_samples = onlyvirusonlybacteria_samples %>% arrange(cum_pathogn)
subsetfiles <- file.path("host-expression/quants", onlyvirusonlybacteria_samples$Filename, "quant.sf")
subset_txi.salmon <- tximport(subsetfiles, type = "salmon", tx2gene = genesymbols)
dds <- DESeqDataSetFromTximport(subset_txi.salmon, onlyvirusonlybacteria_samples, ~ batch + sex + age_scaled + cum_pathogn)
dds <- DESeq(dds)
res <- results(dds)
summary(res)
##
## out of 31126 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 2944, 9.5%
## LFC < 0 (down) : 1891, 6.1%
## outliers [1] : 0, 0%
## low counts [2] : 8333, 27%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
pairwise_genes = which(res$padj < 0.001)
bac_upDEGs = intersect(which(res$padj < 0.001),which(res$log2FoldChange > 0))
viral_upDEGs = intersect(which(res$padj < 0.001),which(res$log2FoldChange < 0))
There are 548 bacterial upDEGs and 273 viral upDEGs.
EnhancedVolcano(res,
lab = rownames(res),
x = 'log2FoldChange',
y = 'pvalue',
pCutoff = 0.001,
FCcutoff = 0,
xlim = c(-7, 7),
ylim=c(-1,27),
pointSize = 1.5,
labSize = 2.5,
title = 'DESeq2 results',
subtitle = 'Differential expression',
caption = 'p-value cutoff, 0.001',
legendPosition = "none",
legendLabSize = 14,
col = c('grey30', 'light gray', 'royalblue', 'red2'),
colAlpha = 0.9,
drawConnectors = FALSE,
widthConnectors = 0.5)
Read in full set of files, including those with NO pathogens and BOTH (virus and bacteria)
files <- file.path("host-expression/quants", metadata$Filename, "quant.sf")
txi.salmon <- tximport(files, type = "salmon", tx2gene = genesymbols)
pdds <- DESeqDataSetFromTximport(txi.salmon, metadata, ~ batch)
prld <- varianceStabilizingTransformation(pdds, blind = TRUE)
These are plots of host gene expression per clinical group
pathOrNot = as.numeric(metadata[,"bv_both_none"]) # using clinical categories
genelist = c("CXCL10", "PRF1", "IFI27", "CCL8", "PTGS2", "S100A9", "PLAUR", "TNFAIP3","IL1B")
for (i in 1:length(genelist)) {
gene = genelist[i]
print(gene)
p = ggplot(data.frame(cbind(assay(prld)[gene,],pathOrNot)), aes(y = V1, x = as.factor(pathOrNot),fill=as.factor(pathOrNot))) +
geom_boxplot(outlier.shape=NA) +
ggtitle(gene) +
theme(axis.title = element_blank(), legend.position="none") +
labs(y = "RNA-seq abundance\nlog10(rpm)", x = "Culture test") +
geom_point(pch = 21, position = position_jitterdodge()) +
scale_fill_discrete(labels=c("None detected", "Bacterial", "Viral", "Both")) +
labs(fill = "Infection type")
assign(paste0("p", i), p)
}
## [1] "CXCL10"
## [1] "PRF1"
## [1] "IFI27"
## [1] "CCL8"
## [1] "PTGS2"
## [1] "S100A9"
## [1] "PLAUR"
## [1] "TNFAIP3"
## [1] "IL1B"
p1 + p2 + p3 + p4 + p5 + p6 + p7 + p8 + p9 + theme(legend.position="right") + plot_layout(nrow = 3)
Setting up data for sorted heatmaps
metadata.Valid = metadata[which(metadata[,"batch"] != 6),]
#setting up a temp matrix with Z-score scaled expression levels
temp = t(scale(t(assay(prld[,which(metadata[,"batch"] != 6)]))))
temp[temp < -4] = -4
temp[temp > +4] = +4
temp[is.na(temp)] = 0
annotation_col = data.frame(
Category = metadata.Valid$high_pathogens,
Viral_pathogen_abundance = metadata.Valid[,"viral_quantile"],
Bacterial_pathogen_abundance = metadata.Valid[,"threebac_quantile"],
Symptom_severity_score = metadata.Valid[,"PRSS"],
Days_with_cold = metadata.Valid[,"days_with_cold"],
Infection = factor(metadata.Valid[,"bv_both_none"], labels = c("None","Bacterial", "Viral", "Both"))
)
heatmap <- pheatmap(temp[bac_upDEGs,order(bacPathAbundance[-which(metadata$batch ==6)])],
annotation_colors = list(Infection = c(None = "white", Bacterial = "light blue", Viral = "gold", Both = "black"),
Resistance = c("No" = "green", "Yes" = "red")),
annotation_col = annotation_col,
show_colnames = F,
show_rownames = F,
border_color=NA,
col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(55),
#col = colorRampPalette(c("navy", "white", "red"))(50),
fontsize_row=6,
cluster_cols = F,
cluster_rows = T,
annotation_legend = TRUE)
heatmap1 <- pheatmap(temp[viral_upDEGs,order(metadata.Valid$high_pathogen)],
annotation_colors = list(Infection = c(None = "white", Bacterial = "light blue", Viral = "gold", Both = "black"),
Resistance = c("No" = "white", "Yes" = "black")),
annotation_col = annotation_col,
show_colnames = F,
show_rownames = F,
border_color=NA,
col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(55),
#col = colorRampPalette(c("navy", "white", "red"))(50),
fontsize_row=6,
cluster_cols = F,
cluster_rows = T,
annotation_legend = TRUE)
heatmap2 <- pheatmap(temp[bac_upDEGs,order(metadata.Valid$high_pathogen)],
annotation_colors = list(Infection = c(None = "white", Bacterial = "light blue", Viral = "gold", Both = "black"),
Resistance = c("No" = "white", "Yes" = "black")),
annotation_col = annotation_col,
show_colnames = F,
show_rownames = F,
border_color=NA,
col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(55),
#col = colorRampPalette(c("navy", "white", "red"))(50),
fontsize_row=6,
cluster_cols = F,
cluster_rows = T,
annotation_legend = TRUE)
temp = t(scale(t(assay(prld[,]))))
viralResponseScore = apply(temp[viral_upDEGs,],2,mean)
bacterialResponseScore = apply(temp[bac_upDEGs,],2,mean)
group = as.numeric(metadata.Valid[,"threebac_quantile"])
p = ggplot(data.frame(cbind(bacterialResponseScore,group, row.names=NULL)), aes(y = bacterialResponseScore, x = as.factor(group),fill=as.factor(group))) +
geom_boxplot(outlier.shape=NA) +
geom_point(pch = 21, position = position_jitterdodge()) +
labs(fill = "Pathogen abundance") +
labs(x = "Bacterial pathogen abundance (decile)", y = "Bacterial response score\n(Mean expression Z-score of bacterial upDEGs)") +
theme(legend.position = "none")
p
group = as.numeric(metadata[,"viral_quantile"])
p = ggplot(data.frame(cbind(viralResponseScore,group, row.names=NULL)), aes(y = viralResponseScore, x = as.factor(group),fill=as.factor(group))) +
geom_boxplot(outlier.shape=NA) +
geom_point(pch = 21, position = position_jitterdodge()) +
labs(fill = "Pathogen abundance") +
labs(x = "Viral pathogen abundance (decile)", y = "Viral response score\n(Mean expression Z-score of viral upDEGs)") +
theme(legend.position = "none")
p
group = as.numeric(metadata[,"high_pathogens"])
p = ggplot(data.frame(cbind(bacterialResponseScore,group, row.names=NULL)), aes(y = bacterialResponseScore, x = as.factor(group),fill=as.factor(group))) +
geom_boxplot(outlier.shape=NA) +
geom_point(pch = 21, position = position_jitterdodge()) +
scale_fill_discrete(labels=c("Both low", "Viral high", "Bacterial high", "Both high")) +
labs(fill = "Pathogen abundance") +
labs(x = "Pathogen abundance (decile)", y = "Bacterial response score\n(Mean expression Z-score of bacterial upDEGs)")
p
group = as.numeric(metadata[,"high_pathogens"])
p = ggplot(data.frame(cbind(viralResponseScore,group, row.names=NULL)), aes(y = viralResponseScore, x = as.factor(group),fill=as.factor(group))) +
geom_boxplot(outlier.shape=NA) +
geom_point(pch = 21, position = position_jitterdodge()) +
scale_fill_discrete(labels=c("Both low", "Viral high", "Bacterial high", "Both high")) +
labs(fill = "Pathogen abundance") +
labs(x = "Pathogen abundance (decile)", y = "Viral response score\n(Mean expression Z-score of viral upDEGs)")
p
### Now examining against the clinical categories
group = as.numeric(metadata$bv_both_none)
p = ggplot(data.frame(cbind(bacterialResponseScore,group, row.names=NULL)), aes(y = bacterialResponseScore, x = as.factor(group),fill=as.factor(group))) +
geom_boxplot(outlier.shape=NA) +
geom_point(pch = 21, position = position_jitterdodge()) +
scale_fill_discrete(labels=c("Both low", "Viral high", "Bacterial high", "Both high")) +
labs(fill = "Pathogen abundance") +
labs(x = "Pathogen abundance (decile)", y = "Bacterial response score\n(Mean expression Z-score of bacterial upDEGs)")
p
group = as.numeric(metadata$bv_both_none)
p = ggplot(data.frame(cbind(viralResponseScore,group, row.names=NULL)), aes(y = viralResponseScore, x = as.factor(group),fill=as.factor(group))) +
geom_boxplot(outlier.shape=NA) +
geom_point(pch = 21, position = position_jitterdodge()) +
scale_fill_discrete(labels=c("Both low", "Viral high", "Bacterial high", "Both high")) +
labs(fill = "Pathogen abundance") +
labs(x = "Pathogen abundance (decile)", y = "Viral response score\n(Mean expression Z-score of viral upDEGs)")
p
pathOrNot = as.numeric(metadata[,"high_pathogens"]) # using categories "Both low" "Viral high" "Bacterial high" "Both high"
genelist = c("PTGS2", "S100A9", "PLAUR", "TNFAIP3", "SERPINA2", "CXCL2","IL1B", "CXCL11","IFNL1","CXCL10", "PRF1", "IFI27", "CCL8","OAS2")
for (i in 1:length(genelist)) {
gene = genelist[i]
p = ggplot(data.frame(cbind(assay(prld)[gene,],pathOrNot)), aes(y = V1, x = as.factor(pathOrNot),fill=as.factor(pathOrNot))) +
geom_boxplot(outlier.shape=NA) +
ggtitle(gene) +
theme(axis.title = element_blank(), legend.position="none") +
geom_point(pch = 21, position = position_jitterdodge()) +
scale_fill_discrete(labels=c("None detected", "Viral", "Bacterial", "Both")) +
labs(fill = "Infection type")
assign(paste0("p", i), p)
}
p1 + p2 + p3 + p4 + p5 + p6 + p7 + p8 + p9 + p10 + p11 + p12 + p13 + p14 + plot_layout(nrow = 2)
Here, we will compute the correlation between the bacterial host response score and the total bacterial pathogen abundance (MCAT + SPN + HFLU), and compare this to 1000 randomly samples selections of three bacterial species from the data.
abundant_bacteria = setdiff(which(apply(matr,2,max) >= 10),grep("virus",colnames(matr)))
cors = vector(length = 1000)
for (i in 1:1000)
{
randSetOfThree = sample(abundant_bacteria,3)
threeBacAbundance = apply(matr[,randSetOfThree],1,sum) # sum of their individual RPMs
#ntile(threeBacAbundance, 10)
cors[i] = cor(bacterialResponseScore,log(threeBacAbundance + 1,10))
}
hist(cors,xlim=c(-1,1),breaks=100)
abline(v = cor(bacterialResponseScore, log(bacPathAbundance + 1,10)))
Viral ROC
response = vector(length = nrow(metadata.Valid))
response[] = 0
response[which(metadata.Valid$high_pathogen == 1 | metadata.Valid$high_pathogen == 3)] = 1
prediction = viralResponseScore[which(metadata$batch !=6)]
bacPlot = plot.roc(roc(response,prediction))
#coords(bacPlot) #for a list of sensitivities and specificities
auc(response,prediction)
## [1] 0.7963905
Bacterial ROC
response = vector(length = nrow(metadata.Valid))
response[] = 0
response[which(metadata.Valid$high_pathogen == 2 | metadata.Valid$high_pathogen == 3)] = 1
prediction = bacterialResponseScore[which(metadata$batch !=6)]
bacPlot = plot.roc(roc(response,prediction))
#coords(bacPlot) #for a list of sensitivities and specificities
auc(response,prediction)
## [1] 0.7786522
Clinical - Viral ROC
response = vector(length = nrow(metadata.Valid))
response[] = 0
response[which(metadata.Valid$bv_both_none == 2 | metadata.Valid$bv_both_none == 3)] = 1
prediction = viralResponseScore[which(metadata$batch !=6)]
bacPlot = plot.roc(roc(response,prediction))
#coords(bacPlot) #for a list of sensitivities and specificities
auc(response,prediction)
## [1] 0.7678653
Clinical - Bacterial ROC
response = vector(length = nrow(metadata.Valid))
response[] = 0
response[which(metadata.Valid$bv_both_none == 1 | metadata.Valid$bv_both_none == 3)] = 1
prediction = bacterialResponseScore[which(metadata$batch !=6)]
bacPlot = plot.roc(roc(response,prediction))
#coords(bacPlot) #for a list of sensitivities and specificities
auc(response,prediction)
## [1] 0.7802339
# Load the xCell data
xCell = read.delim("../xcell_results/xCell_tpm_matrix_xCell_0050111924_RAW.txt", sep = '\t', row.names = 1)
xCell_signif = read.delim("../xcell_results/xCell_tpm_matrix_xCell_0050111924.pvals.txt", sep = '\t', row.names = 1)
colnames(xCell) = 1:221
colnames(xCell_signif) = 1:221
# Mask non-significant values
xCell_filtered = xCell
xCell_filtered[xCell_signif >= 0.2] <- NA # Set non-significant values to NA
# Replace NA with a low value (e.g., 0)
xCell_filtered_noNA <- xCell_filtered
xCell_filtered_noNA[is.na(xCell_filtered_noNA)] <- 0
#remove rows with no variance
toRemove = which(apply(xCell_filtered_noNA,1,var) == 0)
xCell_filtered_noNA <- xCell_filtered_noNA[-toRemove,]
breaksList = seq(0, 6000, by = 100)
cols = rev(c(colorRampPalette(brewer.pal(n = 4, name="YlGnBu"))(4)))
# Plot the heatmap
pheatmap(xCell_filtered_noNA, scale = "none", na_col = "white", main = "xCell Heatmap (Significant Values Only)", annotation_col = annotation_col,col=cols)
pvals = vector(length = nrow(xCell_filtered_noNA))
for (i in 1:nrow(xCell_filtered_noNA))
{ pvals[i] = kruskal.test(as.numeric(xCell_filtered_noNA[i,]) ~ as.factor(metadata.Valid$high_pathogens))$p.value
}
pheatmap(xCell_filtered_noNA[which(p.adjust((pvals)) < 0.1),order(annotation_col$Category)], annotation_col = annotation_col,cluster_cols=F,cluster_rows=F)
xCell_filtered_noNA <- xCell_filtered_noNA[which(p.adjust((pvals)) < 0.1),]
Testing the ability of xCell-derived cell quantification to classify samples
# bacteria
xCell_Bac <- matrix(ncol = 1, nrow = nrow(xCell_filtered_noNA))
rownames(xCell_Bac) = rownames(xCell_filtered_noNA)
for (i in 1:nrow(xCell_filtered_noNA))
{
response = vector(length = nrow(metadata.Valid))
response[] = 0
response[which(metadata.Valid$high_pathogen == 2 | metadata.Valid$high_pathogen == 3)] = 1
prediction = as.numeric(xCell_filtered_noNA[i,])
#plot.roc(roc(response,prediction))
xCell_Bac[i,1] = auc(response,prediction)
}
barplot((t(xCell_Bac[order(xCell_Bac),])),las=3,ylim=c(0,1),main="xCell prediction of high_bacterial category from RNA-seq")
print(paste("Top score is:",max(xCell_Bac[,1])))
## [1] "Top score is: 0.757501229709789"
# viruses
xCell_Vir <- matrix(ncol = 1, nrow = nrow(xCell_filtered_noNA))
rownames(xCell_Vir) = rownames(xCell_filtered_noNA)
for (i in 1:nrow(xCell_filtered_noNA))
{
response = vector(length = nrow(metadata.Valid))
response[] = 0
response[which(metadata.Valid$high_pathogen == 1 | metadata.Valid$high_pathogen == 3)] = 1
prediction = as.numeric(xCell_filtered_noNA[i,])
#plot.roc(roc(response,prediction))
xCell_Vir[i,1] = auc(response,prediction)
}
barplot((t(xCell_Vir[order(xCell_Vir),])),las=3,ylim=c(0,1),main="xCell prediction of high_viral category from RNA-seq")
print(paste("Top score is:",max(xCell_Vir[,1])))
## [1] "Top score is: 0.839704675963905"
# bacteria
xCell_Bac <- matrix(ncol = 1, nrow = nrow(xCell_filtered_noNA))
rownames(xCell_Bac) = rownames(xCell_filtered_noNA)
for (i in 1:nrow(xCell_filtered_noNA))
{
response = vector(length = nrow(metadata.Valid))
response[] = 0
response[which(metadata.Valid$bv_both_none == 1 | metadata.Valid$bv_both_none == 3)] = 1
prediction = as.numeric(xCell_filtered_noNA[i,])
#plot.roc(roc(response,prediction))
xCell_Bac[i,1] = auc(response,prediction)
}
barplot((t(xCell_Bac[order(xCell_Bac),])),las=3,ylim=c(0,1),main="xCell prediction of bacterial category from clinical culture")
print(paste("Top score is:",max(xCell_Bac[,1])))
## [1] "Top score is: 0.757426900584795"
# viruses
xCell_Vir <- matrix(ncol = 1, nrow = nrow(xCell_filtered_noNA))
rownames(xCell_Vir) = rownames(xCell_filtered_noNA)
for (i in 1:nrow(xCell_filtered_noNA))
{
response = vector(length = nrow(metadata.Valid))
response[] = 0
response[which(metadata.Valid$bv_both_none == 2 | metadata.Valid$high_pathogen == 3)] = 1
prediction = as.numeric(xCell_filtered_noNA[i,])
#plot.roc(roc(response,prediction))
xCell_Vir[i,1] = auc(response,prediction)
}
barplot((t(xCell_Vir[order(xCell_Vir),])),las=3,ylim=c(0,1),main="xCell prediction of viral category from clinical culture")
print(paste("Top score is:",max(xCell_Vir[,1])))
## [1] "Top score is: 0.752968617472434"
Ruling out date effect on bacterial pathogen abundance
x = as.Date(metadata$Date.NP.swab.collected,format="%d-%b-%y")
y = bacPathAbundance
data <- data.frame(
x_numeric = as.numeric(x), # Convert x to numeric
y_log = log(y + 1, 10) # Apply log transformation to y
)
# Plot using ggplot2
ggplot(data, aes(x = x_numeric, y = y_log)) +
geom_point(color = "blue", size = 3) + # Scatter points
geom_smooth(method = "lm", color = "red", se = TRUE) + # Add trendline with confidence interval
labs(
title = "Scatter Plot with Trendline",
x = "Numeric X (e.g., days since 1970-01-01)",
y = "Log-transformed Y (log10(y + 1))"
) +
theme_minimal()
cor(as.numeric(x),log(y + 1,10))
## [1] NA
cor.test(as.numeric(x),log(y + 1,10))
##
## Pearson's product-moment correlation
##
## data: as.numeric(x) and log(y + 1, 10)
## t = 0.35519, df = 219, p-value = 0.7228
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1083199 0.1554733
## sample estimates:
## cor
## 0.02399436
Ruling out effect on viral pathogen abundance
x = as.Date(metadata$Date.NP.swab.collected,format="%d-%b-%y")
y = viralPathAbundance
data <- data.frame(
x_numeric = as.numeric(x), # Convert x to numeric
y_log = log(y + 1, 10) # Apply log transformation to y
)
# Plot using ggplot2
ggplot(data, aes(x = x_numeric, y = y_log)) +
geom_point(color = "blue", size = 3) + # Scatter points
geom_smooth(method = "lm", color = "red", se = TRUE) + # Add trendline with confidence interval
labs(
title = "Scatter Plot with Trendline",
x = "Numeric X (e.g., days since 1970-01-01)",
y = "Log-transformed Y (log10(y + 1))"
) +
theme_minimal()
cor(as.numeric(x),log(y + 1,10))
## [1] NA
cor.test(as.numeric(x),log(y + 1,10))
##
## Pearson's product-moment correlation
##
## data: as.numeric(x) and log(y + 1, 10)
## t = 0.8874, df = 219, p-value = 0.3758
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.07268811 0.19032512
## sample estimates:
## cor
## 0.05985734
Ruling out batch effect on bacterial abundance
# Prepare the data for ggplot2
data <- data.frame(
log_bacPathAbundance = log(bacPathAbundance + 1, 10),
batch = metadata$batch
)
# Create the boxplot using ggplot2
ggplot(data, aes(x = batch, y = log_bacPathAbundance)) +
geom_boxplot(outlier.color = "red", outlier.size = 2) + # Boxplot with outlier styling
labs(
x = NULL, # No x-axis label
y = "log10(Pathogen Abundance)"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1) # Adjust x-axis text angle if needed
)
Ruling out batch effect on viral pathogen abundance
# Prepare the data for ggplot2
data <- data.frame(
log_viralPathAbundance = log(viralPathAbundance + 1, 10),
batch = metadata$batch
)
# Create the boxplot using ggplot2
ggplot(data, aes(x = batch, y = log_viralPathAbundance)) +
geom_boxplot(outlier.color = "red", outlier.size = 2) + # Boxplot with outlier styling
labs(
x = NULL, # No x-axis label
y = "log10(Pathogen Abundance)"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1) # Adjust x-axis text angle if needed
)
PCA plots
vst_matrix <- assay(prld) # Extract matrix from variance-stabilized object
# Perform PCA
pca <- prcomp(t(vst_matrix)) # Transpose required: genes as columns, samples as rows
# Prepare data for ggplot
pca_data <- data.frame(
PC1 = pca$x[, 1], # First principal component
PC2 = pca$x[, 2], # Second principal component
Batch = metadata$batch # Add batch information for coloring
)
# Create PCA plot
ggplot(pca_data, aes(x = PC1, y = PC2, color = Batch)) +
geom_point(size = 3) + # Plot points
labs(
title = "PCA Plot of Variance-Stabilized Data",
x = "PC1",
y = "PC2",
color = "Batch"
) +
theme_minimal() +
theme(
legend.position = "right",
axis.title = element_text(size = 14),
plot.title = element_text(size = 16, face = "bold")
)